# Load geocoding output from ArcGIS and resave as a smaller file
pacman::p_load(data.table, tidyverse, sf, cowplot, kableExtra, modelsummary, scales, fst, tinytable, pbmcapply)

# Start from an empty workspace.
# NOTE(review): rm(list = ls()) removes every object in the current
# environment, so if this file is source()d at top level from another script,
# that script's variables are wiped as well.
rm(list = ls())
# Run the shared project settings script.
# NOTE(review): presumably this is where INPUT_PRIVATE and WORKING (used below)
# are defined -- confirm in code/globals.R.
source("code/globals.R")

# Load raw geocoding data from ESRI ArcGIS Geocoding Service and limit fields.
#
# Both exports need exactly the same columns, so the read + select logic lives
# in one helper instead of being repeated per table.
#
# relative_path: location of the .RDS export, relative to INPUT_PRIVATE.
# Returns: a data.table with ImportParcelID, Subregion, RegionAbbr,
#   USER_County, BuildingOrImprovementNumber, lon_ESRI (from X) and
#   lat_ESRI (from Y).
load_geocoded_fields <- function(relative_path) {
  geocoded <- readRDS(file.path(INPUT_PRIVATE, relative_path))
  # Convert by reference so the full raw table is not copied before trimming.
  setDT(geocoded)
  geocoded[, .(
    ImportParcelID = USER_ImportParcelID,
    Subregion,
    RegionAbbr,
    USER_County,
    BuildingOrImprovementNumber = USER_BuildingOrImprovementNumber,
    lon_ESRI = X,
    lat_ESRI = Y
  )]
}

statewide_geocoded_whazard <- load_geocoded_fields(
  "ESRI/CAsampleforgeocoding_Geocoded_Addr_2024_03_26.RDS"
)
statewide_geocoded_nohazard <- load_geocoded_fields(
  "ESRI/CAsampleforgeocodingnohazard_Geocoded_Addr_2024_03_26.RDS"
)

# Stack the two extracts; rows from the "whazard" file come first.
statewide_geocoded <- bind_rows(statewide_geocoded_whazard, statewide_geocoded_nohazard)

# Remove duplicates by ImportParcelID, BuildingOrImprovementNumber.
# distinct() keeps the first row it sees for each key, so when the same
# parcel/building appears more than once the earliest row in the table wins.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
statewide_geocoded <- statewide_geocoded %>%
  distinct(ImportParcelID, BuildingOrImprovementNumber, .keep_all = TRUE)

# Drop any obviously wrong based on lat/lon and region: keep only rows with a
# negative longitude, a positive latitude, and a geocoded region of "CA".
# filter() also drops rows where any of these conditions evaluates to NA.
statewide_geocoded <- statewide_geocoded %>%
  filter(lon_ESRI < 0.01 & lat_ESRI > 0.01 & RegionAbbr == "CA")

# Only keep locations where inputted county matches geocoded county
# Normalise county labels for comparison.
#
# x: vector of county names.
# Returns: the names upper-cased, with the first "CITY AND COUNTY OF" and then
#   the first remaining "COUNTY" removed, and surrounding whitespace trimmed.
clean_county_names <- function(x) {
  upper_names <- toupper(x)
  without_city_prefix <- sub("CITY AND COUNTY OF", "", upper_names)
  trimws(sub("COUNTY", "", without_city_prefix))
}

# Re-register the table with data.table, then normalise the geocoded county
# (Subregion) and the inputted county (USER_County) in place so the two can be
# compared as plain strings.
setDT(statewide_geocoded)
statewide_geocoded[, c("Subregion", "USER_County") := lapply(.SD, clean_county_names),
                   .SDcols = c("Subregion", "USER_County")]

# Keep only rows where the two county labels agree, and at the same time limit
# fields to the bare minimum.
statewide_geocoded <- statewide_geocoded[
  Subregion == USER_County,
  .(ImportParcelID, BuildingOrImprovementNumber, lon_ESRI, lat_ESRI)
]

# Save for use in subsequent scripts
write_fst(statewide_geocoded, file.path(WORKING, "statewide-geocoded.fst"))
